
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
import pandas as pd
import numpy as np
# Plot libraries
import matplotlib
from matplotlib import transforms, pyplot as plt
import seaborn as sns
import bokeh
import plotly.express as px
import plotly.graph_objects as go
# Data preparation
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, ADASYN
# Cross validation
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, PredefinedSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Models
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
# Models score
from sklearn.metrics import roc_auc_score, roc_curve
%matplotlib inline
%config Completer.use_jedi = False
# configure plot font family to Arial
plt.rcParams['font.family'] = 'Arial'
# configure mathtext bold and italic font family to Arial
matplotlib.rcParams['mathtext.fontset'] = 'custom'
matplotlib.rcParams['mathtext.bf'] = 'Arial:bold'
matplotlib.rcParams['mathtext.it'] = 'Arial:italic'
# define colors (shared gray/accent palette reused by every figure below)
GRAY1, GRAY2, GRAY3 = '#231F20', '#414040', '#555655'
GRAY4, GRAY5, GRAY6 = '#646369', '#76787B', '#828282'
GRAY7, GRAY8, GRAY9 = '#929497', '#A6A6A5', '#BFBEBE'
BLUE1, BLUE2, BLUE3, BLUE4 = '#174A7E', '#4A81BF', '#94B2D7', '#94AFC5'
RED1, RED2 = '#C3514E', '#E6BAB7'
GREEN1, GREEN2 = '#0C8040', '#9ABB59'
ORANGE1 = '#F79747'
# Load the training data (relative path -- assumes the notebook is run from
# the project root so that 'data/train.csv' resolves; TODO confirm)
df = pd.read_csv('data/train.csv')
df.head()
We will first drop the ID variable, as it carries no information useful for training the model.
# The ID column is an identifier only -- remove it from the feature set
df = df.drop(columns='ID')
df.shape
There are 76020 rows (observations) and 371 variables.
# Distribution of column dtypes (how many integer vs float columns)
df.dtypes.value_counts()
All variables are numeric, being 260 integers and 111 as float.
It is important to check whether any variable contains NA values.
# Total number of missing values across the whole dataframe
df.isna().sum().sum()
There are no NA values in the dataset.
Another important part is to evaluate whether there are any duplicated rows in the data set.
# Number of fully duplicated rows
df.duplicated().sum()
There are 4807 duplicated rows. We will need to drop them.
# Remove duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()
df.shape
Let's take a look to see whether some of these variables might be categorical. First, we will calculate the number of unique values per column.
# How many distinct values each column takes
unique_counts = df.nunique()
counts_df = unique_counts.rename_axis('variable').reset_index(name='count')
# Number of variables per distinct-value count (e.g. how many are constant)
counts_df.groupby('count').count()
It can be seen that 34 variables have constant values, so they do not add any relevant information to the data.
# Keep only the columns that show some variability (more than one distinct value)
df2 = df.loc[:, df.nunique() != 1]
df2.shape
The number of variables is now 336.
Another option to drop some variables is to check whether there is a perfect negative or positive correlation between them. To do that, we calculate the correlation matrix, select its upper triangle, and drop the columns for which abs(corr) == 1.
# Absolute pairwise correlation between all remaining variables
mtcorr = df2.corr().abs()
mtcorr
# Keep only the strict upper triangle of the correlation matrix so that
# every pair of variables is inspected exactly once
upper = mtcorr.where(np.triu(np.ones(mtcorr.shape), k=1).astype(np.bool_))
upper
# Columns that have a perfect absolute correlation (== 1) with another column
# NOTE(review): exact float comparison -- near-perfect correlations
# (e.g. 0.9999999) are kept; confirm that is intended
col_drop = [col for col in upper.columns if (upper[col] == 1).any()]
col_drop
# Drop one column of every perfectly correlated pair
# (fix: pass the labels directly via columns= -- the original
# df2.drop(df2[col_drop], axis=1) materialized a throwaway sub-DataFrame
# and relied on DataFrame iteration yielding column names)
df3 = df2.drop(columns=col_drop)
df3.shape
final_col = df3.columns
We were able to reduce the number of columns from 371 to 306, and the number of rows from 76020 to 71213.
# Count of each target class, with human-readable labels
# (fix: name the column 'COUNT' explicitly via to_frame -- in pandas >= 2.0
# value_counts() names its output 'count', so renaming 'TARGET' silently
# did nothing and label_count.COUNT would then fail)
label_count = df3.TARGET.replace({0:'Satisfied', 1: 'Not Satisfied'}).value_counts().to_frame('COUNT')
# Percentage of each class, rounded to two decimals
label_count['PERC'] = round(label_count.COUNT*100/sum(label_count.COUNT),2)
label_count
# Horizontal bar chart of the class percentages (plotly express)
fig = px.bar(x = label_count.PERC,
y = label_count.index,
template = 'simple_white',
color = label_count.index,
color_discrete_map = {'Satisfied' : GRAY9,
'Not Satisfied': BLUE1},
opacity = 0.7,
orientation='h',
labels = {'x':'Percentage',
'y':'TARGET'},
title = 'TARGET')
# Single-variable plot: the colour legend adds nothing, hide it
fig.update_layout(showlegend=False)
# Global font and title styling; percentage ticks drawn on a top x-axis
fig.update_layout(
font_family="Arial",
font_size = 20,
font_color= GRAY6,
title_font_family="Arial Bold",
title_font_size = 25,
title_font_color= GRAY3,
legend_title_font_color=GRAY6,
xaxis = {"ticksuffix": " %",
'side': "top"}
)
# Strip the axis titles and draw gray axis lines
fig.update_layout(yaxis={'visible': True, 'showticklabels': True, 'title':'', 'linecolor': GRAY6},
xaxis={'visible': True, 'showticklabels': True, 'title':'', 'linecolor':GRAY6})
fig.show()
We can see that the target variable is categorical and that the two categories are not balanced, where 0 corresponds to satisfied clients and 1 to non-satisfied clients. The bar plot improved the visualization, and it can be seen that only 3.95% of clients are 'Not Satisfied'.
# Cast the target to categorical and split features from label
# (fix: column assignment instead of attribute assignment, which can raise
# SettingWithCopyWarning / fail to persist on a derived frame)
df3['TARGET'] = df3['TARGET'].astype('category')
X = df3.drop('TARGET', axis=1)
y = df3['TARGET']
# Set aside 25% for test/validation data for evaluation
# (fix: stratify -- with only ~4% positives an unstratified shuffle can yield
# badly unbalanced splits; random_state makes the split reproducible)
X_train, X_vali_test, y_train, y_vali_test = train_test_split(
    X, y,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=42)
# Of the held-out 25%: 15% of the data for validation, 10% for test
X_vali, X_test, y_vali, y_test = train_test_split(
    X_vali_test, y_vali_test,
    test_size=0.40,
    shuffle=True,
    stratify=y_vali_test,
    random_state=42)
print('------ Data ------ ')
print('X_train shape: {}'.format(X_train.shape))
print('X_vali shape: {}'.format(X_vali.shape))
print('X_test shape: {}'.format(X_test.shape))
print('\n------ Label ------ ')
print('y_train shape: {}'.format(y_train.shape))
print('y_vali shape: {}'.format(y_vali.shape))
print('y_test shape: {}'.format(y_test.shape))
y_train.value_counts()
# Oversample the minority class on the TRAINING split only (SMOTE), so the
# validation and test sets keep the original class distribution
X_res, y_res = SMOTE( n_jobs = -1).fit_resample(X_train, y_train)
y_res.value_counts()
There are several techniques to decompose the attributes into a smaller subset. These can be useful for data exploration, visualization, or for building predictive models or clustering. Because our current dataframe has a lot of variables, we will build a pipeline to reduce the dimensionality of the data.
The following PCA method will be used to reduce the dimensionality of the data.
PCA (Principal Component Analysis) is one of the main methods to reduce the dimensionality of data. It linearly combines multiple columns of the original data in order to maximize the variance captured. Each principal component is orthogonal to the others, and they are ordered by the amount of variance they explain.
# Standardize the features, then project onto all principal components so we
# can inspect the cumulative explained variance
steps = [('std', StandardScaler()),
         ('pca', PCA())]
pca_pipe = Pipeline(steps)
train_pca = pca_pipe.fit_transform(X_res)
# Fraction of total variance explained by each component, in order
pca_number = pca_pipe['pca'].explained_variance_ratio_
fig = go.Figure()
# Cumulative explained variance vs number of components
# (fix: x starts at 1 -- the first cumulative point already uses one
# component, so labelling it 0 understated the component count by one)
fig.add_trace(go.Scatter(
    x = np.arange(1, pca_number.shape[0] + 1),
    y = np.cumsum(pca_number),
    mode = 'lines',
    line = dict( color = BLUE2,
                 width = 4)))
# Edit the layout
fig.update_layout(title='PCA',
                  xaxis_title="Number of PCA's",
                  yaxis_title='Explained Variance',
                  template = 'simple_white',
                  font_family="Arial",
                  font_size = 20,
                  font_color= GRAY6,
                  title_font_family="Arial Bold",
                  title_font_size = 25,
                  title_font_color= GRAY3,
                  legend_title_font_color=GRAY6)
fig.show()
From the interactive plot it is possible to see that with less than 30% of the variables we can explain 95% of the total variance. Let's train the PCA again using a 95% explained-variance cut-off to choose the number of components.
# Refit the scaler+PCA pipeline keeping only enough components to explain
# 95% of the variance (a float n_components is a variance cut-off)
pca_pipe = Pipeline([('std', StandardScaler()),
('pca', PCA(n_components=0.95))])
train_pca = pca_pipe.fit_transform(X_res)
train_pca.shape
# Apply the SAME fitted transformation to the validation and test data
vali_pca = pca_pipe.transform(X_vali)
test_pca = pca_pipe.transform(X_test)
# Baseline model: plain logistic regression
lr = LogisticRegression( solver = 'liblinear',
max_iter= 2500)
%%time
# train model
lr.fit(train_pca, y_res)
# sanity check: number of (resampled) training labels
y_res.shape
print('Logistic Regression')
# predict_proba[:, 1] = probability of class 1 ('Not Satisfied')
print(f'Train AUC score: {round(roc_auc_score(y_res, lr.predict_proba(train_pca)[:, 1]),2)}')
print(f'Vali AUC score: {round(roc_auc_score(y_vali, lr.predict_proba(vali_pca)[:, 1]),2)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, lr.predict_proba(test_pca)[:, 1]),2)}')
# Compute ROC-curve inputs (fpr, tpr, auc) for the train/test/validation splits
def calculate_roc_fpr_tpr(train_data, test_data, valid_data, model,
                          y_train_labels=None, y_test_labels=None,
                          y_vali_labels=None):
    """Compute ROC curve points and AUC scores for three data splits.

    Parameters
    ----------
    train_data, test_data, valid_data : array-like
        Feature matrices for the train, test and validation splits.
    model : estimator
        Fitted classifier exposing ``predict_proba``.
    y_train_labels, y_test_labels, y_vali_labels : array-like, optional
        True labels for each split. Default to the notebook globals
        ``y_res``, ``y_test`` and ``y_vali`` -- backward compatible with the
        original implementation, which read them implicitly.

    Returns
    -------
    tuple of dict
        ``(fpr, tpr, roc_auc)``, each keyed by 'train', 'test', 'vali'.
    """
    # Fall back on the module-level labels when none are passed explicitly
    if y_train_labels is None:
        y_train_labels = y_res
    if y_test_labels is None:
        y_test_labels = y_test
    if y_vali_labels is None:
        y_vali_labels = y_vali
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Identical computation per split, so run it once in a loop
    # (fix of the original's triplicated copy-paste blocks)
    for split, data, labels in (('train', train_data, y_train_labels),
                                ('test', test_data, y_test_labels),
                                ('vali', valid_data, y_vali_labels)):
        # Probability of the positive class (column 1 of predict_proba)
        proba = model.predict_proba(data)[:, 1]
        fpr[split], tpr[split], _ = roc_curve(labels, proba)
        roc_auc[split] = roc_auc_score(labels, proba)
    return fpr, tpr, roc_auc
# ROC inputs for the baseline logistic regression
fpr, tpr, roc_auc = calculate_roc_fpr_tpr(train_pca, test_pca, vali_pca, model = lr)
def plot_auc_curve(fpr, tpr, roc_auc, title_model = ''):
    """Plot the train/test/validation ROC curves for one model.

    Parameters
    ----------
    fpr, tpr, roc_auc : dict
        Keyed by 'train', 'test' and 'vali', as produced by
        ``calculate_roc_fpr_tpr``.
    title_model : str, optional
        Model name appended to the figure title.
    """
    fig = go.Figure()
    # Dashed diagonal = performance of a random classifier (AUC 0.5)
    fig.add_shape(
        type='line', line=dict(dash='dash', color = GRAY9),
        x0=0, x1=1, y0=0, y1=1
    )
    # One trace per split (fix of the triplicated add_trace copy-paste);
    # the legend label prefixes reproduce the original strings byte-for-byte
    for split, color, label in (('train', BLUE2, 'train - AUC ='),
                                ('test', ORANGE1, 'test - AUC = '),
                                ('vali', RED1, 'vali - AUC = ')):
        fig.add_trace(go.Scatter(x=fpr[split], y=tpr[split],
                                 name=label + str(round(roc_auc[split], 2)),
                                 mode='lines',
                                 line=dict(color=color, width=4)))
    # Global styling: titles, fonts, fixed figure size
    fig.update_layout(
        title='AUC ' + title_model,
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        template = "plotly_white",
        width=800, height=600,
        font_family="Arial",
        font_size = 20,
        font_color= GRAY6,
        title_font_family="Arial Bold",
        title_font_size = 25,
        title_font_color= GRAY3,
        legend_title_font_color=GRAY6
    )
    # Place the legend inside the plot, lower-right area
    fig.update_layout(legend=dict(
        yanchor="top",
        y=0.3,
        xanchor="left",
        x=0.6
    ))
    fig.show()
# ROC curves for the baseline (untuned) logistic regression
plot_auc_curve(fpr, tpr, roc_auc, title_model = 'Logistic Regression')
It can be seen that the performance on the training data is considerably better than on the test set. To improve the model we will perform hyperparameter tuning.
%%time
# define the logistic-regression hyper-parameter search space
solvers = ['sag']
penalty = ['l2']
# 25 values of the inverse regularization strength, log-spaced in [1, 1000]
c_values = np.logspace(0, 3, 25)
# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values)
# Stratified folds keep the class ratio stable in every fold
cv = StratifiedKFold(n_splits=5,
                     shuffle = True)
lr_grid = GridSearchCV(estimator=LogisticRegression(tol= 1e-3, max_iter= 5000),
                       param_grid=grid,
                       n_jobs=-1,
                       cv=cv,
                       scoring= 'roc_auc',
                       verbose = 2)
# Fit the search; the GridSearchCV object itself stores every result
# (fix: dropped the unused `lr_result` binding)
lr_grid.fit(train_pca, y_res)
lr_grid.best_score_
lr_grid.best_params_
print('Logistic Regression')
print(f'Train AUC score: {round(roc_auc_score(y_res, lr_grid.predict_proba(train_pca)[:, 1]),3)}')
print(f'Vali AUC score: {round(roc_auc_score(y_vali, lr_grid.predict_proba(vali_pca)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, lr_grid.predict_proba(test_pca)[:, 1]),3)}')
We will try to use a specific validation set to be used in the grid search.
# PredefinedSplit convention: -1 = row always used for training,
# 0 = row belongs to validation fold 0
split_index_pca = [-1]*len(train_pca) + [0]*len(X_vali)
# Stack the (resampled) training data on top of the validation data
X_new = np.concatenate((train_pca, vali_pca), axis=0)
y_new = np.concatenate((y_res, y_vali), axis=0)
print(X_new.shape)
print(y_new.shape)
%%time
# Grid search evaluated on the single predefined train/validation split
cv_split_pca = PredefinedSplit(test_fold= split_index_pca)
lr_grid_split = GridSearchCV(estimator=LogisticRegression(tol= 1e-3, max_iter= 5000),
                             param_grid=grid,
                             n_jobs=-1,
                             cv=cv_split_pca,
                             scoring= 'roc_auc',
                             verbose = 2)
lr_grid_split.fit(X_new, y_new)
print(f'Best score: {lr_grid_split.best_score_}.')
# fix: report the parameters of THIS search -- the original printed
# lr_grid.best_params_, a copy-paste from the earlier cross-validated search
print(f'Best parameters: {lr_grid_split.best_params_}')
Best parameters:
Best model training:
# Retrain a logistic regression with the best parameters found above
# NOTE(review): C = 750 is hard-coded from a previous run of the search --
# confirm it matches the reported best_params_
lr_best = LogisticRegression( C = 750,
penalty = 'l2',
solver = 'sag' ,
tol= 1e-3,
max_iter= 5000
)
lr_best.fit(train_pca, y_res)
print('Logistic Regression - Best Model')
print(f'Train AUC score: {round(roc_auc_score(y_res, lr_best.predict_proba(train_pca)[:, 1]), 3)}')
print(f'Vali AUC score: {round(roc_auc_score(y_vali, lr_best.predict_proba(vali_pca)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, lr_best.predict_proba(test_pca)[:, 1]),3)}')
# ROC curves for the tuned logistic regression
fpr, tpr, roc_auc = calculate_roc_fpr_tpr(train_pca, test_pca, vali_pca, model = lr_best)
plot_auc_curve(fpr, tpr, roc_auc, title_model= 'Logistic Regression Tunned')
from sklearn.naive_bayes import GaussianNB
# Naive Bayes baseline on the PCA-transformed data
nb = GaussianNB()
nb.fit(train_pca, y_res)
# fix: header and validation score added for consistency with the
# evaluation printed for every other model in this notebook
print('Gaussian Naive Bayes')
print(f'Train AUC score: {round(roc_auc_score(y_res, nb.predict_proba(train_pca)[:, 1]),2)}')
print(f'Vali AUC score: {round(roc_auc_score(y_vali, nb.predict_proba(vali_pca)[:, 1]),2)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, nb.predict_proba(test_pca)[:, 1]),2)}')
Because XGBoost doesn't require the features to be scaled and centered, we will do a first try both with and without pre-processing.
# Supported tree methods are `gpu_hist`, `approx`, and `hist`.
# Baseline XGBoost on the raw (un-scaled, no-PCA) resampled training data.
# NOTE(review): `gpu_hist` requires a CUDA GPU; in xgboost >= 2.0 this
# spelling is deprecated in favour of tree_method='hist' + device='cuda'
# -- confirm the installed xgboost version.
clf = xgb.XGBClassifier(
tree_method="gpu_hist",
objective = 'binary:logistic', #binary:logitraw
eval_metric = 'auc',
sampling_method = 'gradient_based',
n_jobs = -1,
# Default Parameters
learning_rate = 0.3,
max_depth = 6,
colsample_bytree =1,
subsample = 1,
min_split_loss = 0,
min_child_weight = 1
)
# Fit on the SMOTE-resampled training features (no pre-processing)
clf.fit(X_res, y_res)
print('XGBoost without PCA, Scale and Center')
print(f'Train AUC score: {round(roc_auc_score(y_res, clf.predict_proba(X_res)[:, 1]),3)}')
print(f'Validation AUC score: {round(roc_auc_score(y_vali, clf.predict_proba(X_vali)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]),3)}')
# Supported tree methods are `gpu_hist`, `approx`, and `hist`.
# Same XGBoost configuration, trained on the scaled + PCA-reduced data
# for comparison with the raw-feature model above.
clf_pre = xgb.XGBClassifier(
tree_method="gpu_hist",
objective = 'binary:logistic', #binary:logitraw
eval_metric = 'auc',
sampling_method = 'gradient_based',
n_jobs = -1,
# Default Parameters
learning_rate = 0.3,
max_depth = 6,
colsample_bytree =1,
subsample = 1,
min_split_loss = 0,
min_child_weight = 1
)
# Fit on the PCA-transformed (scaled + centered) training data
clf_pre.fit(train_pca, y_res)
print('XGBoost with PCA, Scale and Center')
print(f'Train AUC score: {round(roc_auc_score(y_res, clf_pre.predict_proba(train_pca)[:, 1]),3)}')
print(f'Validation AUC score: {round(roc_auc_score(y_vali, clf_pre.predict_proba(vali_pca)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, clf_pre.predict_proba(test_pca)[:, 1]),3)}')
The model that uses all variables without pre-processing presented better results, so we will tune the model in the next subchapter.
# Base XGBoost estimator; the tunable hyper-parameters come from the search
estimator = xgb.XGBClassifier(
tree_method="gpu_hist",
objective = 'binary:logistic',
eval_metric = 'auc',
sampling_method = 'gradient_based',
n_jobs = -1
)
# Randomized-search space over the main capacity/regularization knobs
parameters = {
'max_depth': range (2, 6, 1),
'n_estimators': range(50, 500, 50),
'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.3],
'colsample_bytree' : [0.8, 0.85, 0.9, 0.95, 1],
'colsample_bylevel': [0.8, 0.85, 0.9, 0.95, 1],
'subsample' : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
'min_split_loss' : [0, 0.5, 2, 5, 10, 15]
}
# PredefinedSplit convention: -1 = always train, 0 = validation fold
split_index = [-1]*len(X_res) + [0]*len(X_vali)
X_new2 = np.concatenate((X_res, X_vali), axis=0)
y_new2 = np.concatenate((y_res, y_vali), axis=0)
cv_split = PredefinedSplit(test_fold= split_index)
print(X_new2.shape)
print(y_new2.shape)
%%time
# Randomized search: 1000 sampled parameter combinations, scored by
# ROC-AUC on the predefined train/validation split
xgb_grid = RandomizedSearchCV(
estimator=estimator,
param_distributions=parameters,
n_iter = 1000,
scoring = 'roc_auc',
n_jobs = -1,
cv = cv_split,
verbose = 3
)
xgb_grid.fit(X_new2, y_new2)
xgb_grid.best_params_
The best model has the following parameters:
print('XGBoost - Tuned Model')
# NOTE(review): with refit=True (the default) the best model was refit on
# train+validation, so the validation AUC below is optimistic -- confirm
print(f'Train AUC score: {round(roc_auc_score(y_res, xgb_grid.predict_proba(X_res)[:, 1]),3)}')
print(f'Vali AUC score: {round(roc_auc_score(y_vali, xgb_grid.predict_proba(X_vali)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, xgb_grid.predict_proba(X_test)[:, 1]),3)}')
# ROC curves for the tuned XGBoost model (raw features, no PCA)
fpr, tpr, roc_auc = calculate_roc_fpr_tpr(X_res, X_test, X_vali, model = xgb_grid)
plot_auc_curve(fpr, tpr, roc_auc, title_model= 'XGBoost Tunned')
Although the model is still overfitting, we were able to significantly improve the validation and test scores.
import lightgbm as lgb
# LightGBM baseline with (mostly) default parameters on the raw resampled data.
# NOTE(review): the `silent` parameter was removed in recent lightgbm
# releases -- confirm the installed version still accepts it.
lgbm_class = lgb.LGBMClassifier(boosting_type ='gbdt',
max_depth= -1,
num_leaves= 31,
learning_rate= 0.1,
n_estimators= 100,
objective= 'binary',
min_child_samples= 20,
colsample_bytree= 1.0,
subsample= 1.0,
n_jobs= -1,
importance_type= 'split',
reg_alpha= 0.0,
reg_lambda= 0.0,
silent= 'warn'
)
lgbm_class.fit(X_res, y_res)
print('LightGBM')
print(f'Train AUC score: {round(roc_auc_score(y_res, lgbm_class.predict_proba(X_res)[:, 1]),3)}')
print(f'Validation AUC score: {round(roc_auc_score(y_vali, lgbm_class.predict_proba(X_vali)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, lgbm_class.predict_proba(X_test)[:, 1]),3)}')
# Base LightGBM estimator for the randomized search below
lgbm_estimator = lgb.LGBMClassifier(boosting_type ='gbdt',
objective= 'binary',
n_jobs= -1,
silent= 'warn',
importance_type= 'split'
)
# Search space; NOTE this rebinds the `parameters` dict previously used
# for the XGBoost search
parameters = {
'max_depth': range(2, 6, 1),
'num_leaves' : range(5, 50, 2),
'n_estimators': range(50, 500, 50),
'learning_rate': [0.001, 0.01, 0.03, 0.05, 0.075, 0.1, 0.3],
'min_child_samples' : np.arange(20, 100, 5),
'colsample_bytree' : np.arange(0.8, 1, 0.02),
'subsample' : np.arange(0.5, 1, 0.05),
'reg_alpha': np.arange(0, 1, 0.1),
'reg_lambda': np.arange(0, 1, 0.1)
}
%%time
# Randomized search: 10000 sampled parameter combinations, scored by
# ROC-AUC on the same predefined train/validation split as XGBoost
light_grid = RandomizedSearchCV(
estimator=lgbm_estimator,
param_distributions=parameters,
n_iter = 10000,
scoring = 'roc_auc',
n_jobs = -1,
cv = cv_split,
verbose = 3
)
light_grid.fit(X_new2, y_new2)
print(f'Best LightGBM Score: {round(light_grid.best_score_,3 )}')
print(f'Best LightGBM Parameters: {light_grid.best_params_}')
Best LightGBM parameters:
print('LightGBM - Tunned')
# NOTE(review): predict_proba delegates to the refit best estimator, which
# was refit on train+validation -- the validation AUC below is optimistic
print(f'Train AUC score: {round(roc_auc_score(y_res, light_grid.predict_proba(X_res)[:, 1]),3)}')
print(f'Validation AUC score: {round(roc_auc_score(y_vali, light_grid.predict_proba(X_vali)[:, 1]),3)}')
print(f'Test AUC score: {round(roc_auc_score(y_test, light_grid.predict_proba(X_test)[:, 1]),3)}')
# ROC curves for the tuned LightGBM model
fpr, tpr, roc_auc = calculate_roc_fpr_tpr(X_res, X_test, X_vali, model = light_grid)
plot_auc_curve(fpr, tpr, roc_auc, title_model= 'LightGBM Tunned')
# Load the competition test set for the final submission
test = pd.read_csv('data/test.csv')
test
test_id = test.ID
# Keep only the columns used for training; final_col still contains the
# train-only TARGET column, so drop it from the selection
test_data = test[final_col.drop('TARGET')]
test_data.shape
# Predicted probability of class 1 ('Not Satisfied') for every client ID
df_submission = pd.DataFrame(data = {'ID' : test_id,
'TARGET' : light_grid.predict_proba(test_data)[:, 1]})
df_submission
df_submission.to_csv('submission_v1.csv', index = False)
The final predictions were submitted to Kaggle and we got the following scores: